library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
data = read.csv("data- originalchannel4 bigdata.csv", header = T, fill = T) #758916 obs
cat("the total number of IDs in this file is", nrow(data))
## the total number of IDs in this file is 758916

Section 1: Data cleaning and generating variables

data cleanup and recoding variables

#remove_repeats
data2 = subset(data, repeat. == 0) #695166

cat("the number of non-repeated participants is", nrow(data2))
## the number of non-repeated participants is 695166
#keep males of females
data2 = subset(data2, sex == 1 | sex == 2)#672279

cat("\n")
cat("Of this, the number of participants who are either male or female is",  nrow(data2))
## Of this, the number of participants who are either male or female is 672279
##keep age bound

data2 = subset(data2, age > 15 & age < 90)

cat("\n")
cat("Of this, the number participants after all QC is ",  nrow(data2))
## Of this, the number participants after all QC is  671606

Recoding variables

#recode AQ
data2[,c(51,57,58,60)] <- lapply(data2[,c(51,57,58,60)], function(x) 
  recode(x,"1" = 1, "2" = 1, "3" = 0, "4" =0 ))

data2[,c(52,53,54,55,56,59)] <- lapply(data2[,c(52,53,54,55,56,59)], function(x) 
   recode(x,"1" = 0, "2" = 0, "3" = 1, "4" =1))

data2$AQ_full = data2$AQ_1 + data2$AQ_2 + data2$AQ_3 + data2$AQ_4 + data2$AQ_5 + data2$AQ_6 + data2$AQ_7 + data2$AQ_8 + data2$AQ_9 + data2$AQ_10

data2$AQ_Z = scale(data2$AQ_full, center = TRUE, scale = TRUE)


#Recode EQ
data2[,c(31, 32, 34, 36, 39)] <- lapply(data2[,c(31, 32, 34, 36, 39)], function(x) 
  recode(x,"1" = 2, "2" = 1, "3" = 0, "4" =0 ))

data2[,c(33, 35, 37, 38, 40 )] <- lapply(data2[,c(33, 35, 37, 38, 40)], function(x) 
  recode(x,"1" = 0, "2" = 0, "3" = 1, "4" =2))

data2$EQ_full = data2$EQ_1 + data2$EQ_2 + data2$EQ_3 + data2$EQ_4 + data2$EQ_5 + data2$EQ_6 + data2$EQ_7 + data2$EQ_8 + data2$EQ_9 + data2$EQ_10

data2$EQ_Z = scale(data2$EQ_full, center = TRUE, scale = TRUE)


#Recode SQ
data2[,c(41,43, 44, 46, 47, 49, 50)] <- lapply(data2[,c(41,43, 44, 46, 47, 49, 50)], function(x) 
  recode(x,"1" = 2, "2" = 1, "3" = 0, "4" =0 ))

data2[,c(42, 45, 48 )] <- lapply(data2[,c(42, 45, 48)], function(x) 
  recode(x,"1" = 0, "2" = 0, "3" = 1, "4" =2))

data2$SQ_full = data2$SQR_1 + data2$SQR_2 + data2$SQR_3 + data2$SQR_4 + data2$SQR_5 + data2$SQR_6 + data2$SQR_7 + data2$SQR_8 + data2$SQR_9 + data2$SQR_10

data2$SQ_Z = scale(data2$SQ_full, center = TRUE, scale = TRUE)


#Recode SPQ

data2[,c(21:30)] <- lapply(data2[,c(21:30)], function(x) 
  recode(x,"1" = 3, "2" = 2, "3" = 1, "4" =0 ))

data2$SPQ_full = data2$SPQ_1 + data2$SPQ_2 + data2$SPQ_3 + data2$SPQ_4 + data2$SPQ_5 + data2$SPQ_6 + data2$SPQ_7 + data2$SPQ_8 + data2$SPQ_9 + data2$SPQ_10


data2$SPQ_Z = scale(data2$SPQ_full, center = TRUE, scale = TRUE)

data2 = data2[!is.na(data2$AQ_full),]

Defining cases

#Define cases

#define cases based on all different options
data2$autism = ifelse(data2$diagnosis_0 == "2" | data2$diagnosis_1 == "2"| data2$diagnosis_3 == "2" | data2$diagnosis_4 == "2" | data2$diagnosis_5 == "2" | data2$diagnosis_6 == "2" | data2$diagnosis_7 == "2" | data2$diagnosis_8 == "2" | data2$autism_diagnosis_0 > 0 | data2$autism_diagnosis_1 > 0 | data2$autism_diagnosis_2 > 0, 1, 0)

data2$autism[is.na(data2$autism)] <- 0


#define cases based only on the autism criteria
data2$autism2 = ifelse(data2$autism_diagnosis_0 > 0 | data2$autism_diagnosis_1 > 0 | data2$autism_diagnosis_2 > 0, 1, 0)

data2$autism2[is.na(data2$autism2)] <- 0

#count the number of cases


a =  nrow(subset(data2, autism == 1))

cat("the number of autism cases is (broad criteria)", a)
## the number of autism cases is (broad criteria) 36648
cat("\n")
b = nrow(subset(data2, autism2 == 1))

cat("the number of autism cases is (narrow criteria)", b)
## the number of autism cases is (narrow criteria) 36426
#define schizophrenia
data2$schizophrenia = ifelse(data2$diagnosis_0 == "7" | data2$diagnosis_1 == "7"| data2$diagnosis_3 == "7" | data2$diagnosis_4 == "7" | data2$diagnosis_5 == "7" | data2$diagnosis_6 == "7" | data2$diagnosis_7 == "7" | data2$diagnosis_8 == "7" | data2$autism_diagnosis_0 == "0" | data2$autism_diagnosis_1 == "0" | data2$autism_diagnosis_2 == "0", 1, 0)

data2$schizophrenia[is.na(data2$schizophrenia)] <- 0

#define ocd
data2$ocd = ifelse(data2$diagnosis_0 == "6" | data2$diagnosis_1 == "6"| data2$diagnosis_3 == "6" | data2$diagnosis_4 == "6" | data2$diagnosis_5 == "6" | data2$diagnosis_6 == "6" | data2$diagnosis_7 == "6" | data2$diagnosis_8 == "6" | data2$autism_diagnosis_0 == "0" | data2$autism_diagnosis_1 == "0" | data2$autism_diagnosis_2 == "0", 1, 0)

data2$ocd[is.na(data2$ocd)] <- 0


#define ld
data2$ld = ifelse(data2$diagnosis_0 == "5" | data2$diagnosis_1 == "5"| data2$diagnosis_3 == "5" | data2$diagnosis_4 == "5" | data2$diagnosis_5 == "5" | data2$diagnosis_6 == "5" | data2$diagnosis_7 == "5" | data2$diagnosis_8 == "5" | data2$autism_diagnosis_0 == "0" | data2$autism_diagnosis_1 == "0" | data2$autism_diagnosis_2 == "0", 1, 0)

data2$ld[is.na(data2$ld)] <- 0

#define depression
data2$depression = ifelse(data2$diagnosis_0 == "4" | data2$diagnosis_1 == "4"| data2$diagnosis_3 == "4" | data2$diagnosis_4 == "4" | data2$diagnosis_5 == "4" | data2$diagnosis_6 == "4" | data2$diagnosis_7 == "4" | data2$diagnosis_8 == "4" | data2$autism_diagnosis_0 == "0" | data2$autism_diagnosis_1 == "0" | data2$autism_diagnosis_2 == "0", 1, 0)

data2$depression[is.na(data2$depression)] <- 0


#define bipolar
data2$bipolar = ifelse(data2$diagnosis_0 == "3" | data2$diagnosis_1 == "3"| data2$diagnosis_3 == "3" | data2$diagnosis_4 == "3" | data2$diagnosis_5 == "3" | data2$diagnosis_6 == "3" | data2$diagnosis_7 == "3" | data2$diagnosis_8 == "3" | data2$autism_diagnosis_0 == "0" | data2$autism_diagnosis_1 == "0" | data2$autism_diagnosis_2 == "0", 1, 0)

data2$bipolar[is.na(data2$bipolar)] <- 0

#define adhd
data2$adhd = ifelse(data2$diagnosis_0 == "1" | data2$diagnosis_1 == "1"| data2$diagnosis_3 == "1" | data2$diagnosis_4 == "1" | data2$diagnosis_5 == "1" | data2$diagnosis_6 == "1" | data2$diagnosis_7 == "1" | data2$diagnosis_8 == "6" | data2$autism_diagnosis_0 == "0" | data2$autism_diagnosis_1 == "0" | data2$autism_diagnosis_2 == "0", 1, 0)

data2$ocd[is.na(data2$ocd)] <- 0

Defining other variables

#Define other variables

data2$STEM = ifelse(data2$occupation  == "3" | data2$occupation == "5" | data2$occupation == "21" , 1, 0)
data2$STEM[is.na(data2$STEM)] <- 0
data2$STEM = ifelse(data2$occupation == "26", 0, data2$STEM)

STEM = subset(data2, STEM == "1")

cat("No of participants in STEM occupation is", nrow(STEM))
## No of participants in STEM occupation is 76267
cat("\n")
###update country region
data2$countryregion = ifelse(data2$countryregion == "14", 0, data2$countryregion)

###update education

data2$education = ifelse(data2$education == "5", 0, data2$education)

###update handedness

data2$handedness = ifelse(data2$handedness == "4", 0, data2$handedness)

Generating D-scores and brain types

#This is Wheelwright's method: https://www.ncbi.nlm.nih.gov/pubmed/16473340

controls = subset(data2, autism == "0")
cases = subset(data2, autism == "1")

meanSQ = mean(controls$SQ_full)
meanEQ = mean(controls$EQ_full)

data2$SQ_full_standardized_w = (data2$SQ_full - meanSQ)/20
data2$EQ_full_standardized_w = (data2$EQ_full - meanEQ)/20
data2$wheelwrightD = data2$SQ_full_standardized_w - data2$EQ_full_standardized_w

data2$dpercentile = ntile(data2$wheelwrightD, 100)

data2$braintype = ifelse(data2$dpercentile < 2.5, "1",
                 ifelse(between(data2$dpercentile, 2.499, 35), "2",
                    ifelse(between(data2$dpercentile, 34.99, 65), "3", 
                           ifelse(between(data2$dpercentile, 64.99 , 97.5), "4", "5"))))

controls = subset(data2, autism == "0") #redefining cases and controls again so downstream analysis can include braintype
cases = subset(data2, autism == "1")

seperating by sex

males_controls = subset(controls, sex == "1")
females_controls = subset(controls, sex == "2")

dim(males_controls)
## [1] 241355     82
dim(females_controls)
## [1] 393600     82
males_cases = subset(cases, sex == "1")
females_cases = subset(cases, sex == "2")

dim(males_cases)
## [1] 18188    82
dim(females_cases)
## [1] 18460    82

Section 2: Sex, age and geographical location differences

Sex differences in controls

#Basic statistics

# Sex differences
t.test(controls$AQ_full ~ controls$sex)
## 
##  Welch Two Sample t-test
## 
## data:  controls$AQ_full by controls$sex
## t = 68.682, df = 501320, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.3897157 0.4126117
## sample estimates:
## mean in group 1 mean in group 2 
##        3.570429        3.169266
ggplot(controls, aes(x=AQ_full, colour=as.character(sex))) +   geom_density() + theme_minimal()

t.test(controls$EQ_full ~ controls$sex)
## 
##  Welch Two Sample t-test
## 
## data:  controls$EQ_full by controls$sex
## t = -155.09, df = 518030, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.945568 -1.897007
## sample estimates:
## mean in group 1 mean in group 2 
##        8.875432       10.796720
ggplot(controls, aes(x=EQ_full, colour=as.character(sex))) +   geom_density() + theme_minimal()

t.test(controls$SQ_full ~ controls$sex)
## 
##  Welch Two Sample t-test
## 
## data:  controls$SQ_full by controls$sex
## t = 122.11, df = 481110, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.263506 1.304729
## sample estimates:
## mean in group 1 mean in group 2 
##        6.734478        5.450361
ggplot(controls, aes(x=SQ_full, colour=as.character(sex))) +   geom_density() + theme_minimal()

t.test(controls$SPQ_full ~ controls$sex)
## 
##  Welch Two Sample t-test
## 
## data:  controls$SPQ_full by controls$sex
## t = -57.099, df = 526580, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.8558141 -0.7990105
## sample estimates:
## mean in group 1 mean in group 2 
##        13.99455        14.82196
ggplot(controls, aes(x=SPQ_full, colour=as.character(sex))) +   geom_density() + theme_minimal()

controls2 = controls[,c ("sex", "AQ_full", "EQ_full", "SQ_full", "SPQ_full" )]
controls2 = na.omit(controls2)
controls2_melt =  melt(controls2, id.vars=c("sex"))
ddply(controls2_melt, c("sex", "variable"), summarise,
      mean = mean(value), sd = sd(value),
      sem = sd(value)/sqrt(length(value)))
##   sex variable      mean       sd         sem
## 1   1  AQ_full  3.570429 2.278934 0.004638778
## 2   1  EQ_full  8.875432 4.756196 0.009681254
## 3   1  SQ_full  6.734478 4.180116 0.008508640
## 4   1 SPQ_full 13.994552 5.515901 0.011227636
## 5   2  AQ_full  3.169266 2.226789 0.003549372
## 6   2  EQ_full 10.796720 4.849119 0.007729213
## 7   2  SQ_full  5.450361 3.877523 0.006180545
## 8   2 SPQ_full 14.821964 5.747510 0.009161196

Sex differences in cases

t.test(cases$AQ_full ~ cases$sex)
## 
##  Welch Two Sample t-test
## 
## data:  cases$AQ_full by cases$sex
## t = 7.4906, df = 36638, p-value = 7.008e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1561934 0.2669022
## sample estimates:
## mean in group 1 mean in group 2 
##        4.875632        4.664085
ggplot(cases, aes(x=AQ_full, colour=as.character(sex))) +   geom_density() + theme_minimal()

t.test(cases$EQ_full ~ cases$sex)
## 
##  Welch Two Sample t-test
## 
## data:  cases$EQ_full by cases$sex
## t = -26.415, df = 36550, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.444680 -1.245096
## sample estimates:
## mean in group 1 mean in group 2 
##        6.920497        8.265385
ggplot(cases, aes(x=EQ_full, colour=as.character(sex))) +   geom_density() + theme_minimal()

t.test(cases$SQ_full ~ cases$sex)
## 
##  Welch Two Sample t-test
## 
## data:  cases$SQ_full by cases$sex
## t = 20.866, df = 36441, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.8907598 1.0754565
## sample estimates:
## mean in group 1 mean in group 2 
##        8.077854        7.094745
ggplot(cases, aes(x=SQ_full, colour=as.character(sex))) +   geom_density() + theme_minimal()

t.test(cases$SPQ_full ~ cases$sex)
## 
##  Welch Two Sample t-test
## 
## data:  cases$SPQ_full by cases$sex
## t = -11.968, df = 36607, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.9046750 -0.6500477
## sample estimates:
## mean in group 1 mean in group 2 
##        16.33055        17.10791
ggplot(cases, aes(x=SPQ_full, colour=as.character(sex))) +   geom_density() + theme_minimal()

cases2 = cases[,c ("sex", "AQ_full", "EQ_full", "SQ_full", "SPQ_full" )]
cases2 = na.omit(cases2)
cases2_melt =  melt(cases2, id.vars=c("sex"))
ddply(cases2_melt, c("sex", "variable"), summarise,
      mean = mean(value), sd = sd(value),
      sem = sd(value)/sqrt(length(value)))
##   sex variable      mean       sd        sem
## 1   1  AQ_full  4.875632 2.662900 0.01974524
## 2   1  EQ_full  6.920497 4.710717 0.03492967
## 3   1  SQ_full  8.077854 4.642268 0.03442213
## 4   1 SPQ_full 16.330548 6.272512 0.04651029
## 5   2  AQ_full  4.664085 2.743430 0.02019194
## 6   2  EQ_full  8.265385 5.032808 0.03704201
## 7   2  SQ_full  7.094745 4.371088 0.03217168
## 8   2 SPQ_full 17.107909 6.160577 0.04534251

Quantifying differences in effect sizes

summary(lm(AQ_full ~ as.character(sex) + as.character(autism) + as.character(sex):as.character(autism), data = data2))
## 
## Call:
## lm(formula = AQ_full ~ as.character(sex) + as.character(autism) + 
##     as.character(sex):as.character(autism), data = data2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8756 -1.5704 -0.1693  1.4296  6.8307 
## 
## Coefficients:
##                                           Estimate Std. Error t value
## (Intercept)                               3.570429   0.004629 771.342
## as.character(sex)2                       -0.401164   0.005879 -68.234
## as.character(autism)1                     1.305203   0.017486  74.644
## as.character(sex)2:as.character(autism)1  0.189616   0.024475   7.747
##                                          Pr(>|t|)    
## (Intercept)                               < 2e-16 ***
## as.character(sex)2                        < 2e-16 ***
## as.character(autism)1                     < 2e-16 ***
## as.character(sex)2:as.character(autism)1  9.4e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.274 on 671599 degrees of freedom
## Multiple R-squared:  0.02719,    Adjusted R-squared:  0.02718 
## F-statistic:  6257 on 3 and 671599 DF,  p-value: < 2.2e-16
summary(lm(EQ_full ~ as.character(sex) + as.character(autism) + as.character(sex):as.character(autism), data = data2))
## 
## Call:
## lm(formula = EQ_full ~ as.character(sex) + as.character(autism) + 
##     as.character(sex):as.character(autism), data = data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.7967  -3.7967   0.2033   3.2033  13.0795 
## 
## Coefficients:
##                                           Estimate Std. Error t value
## (Intercept)                               8.875432   0.009806  905.12
## as.character(sex)2                        1.921288   0.012455  154.26
## as.character(autism)1                    -1.954935   0.037042  -52.78
## as.character(sex)2:as.character(autism)1 -0.576400   0.051848  -11.12
##                                          Pr(>|t|)    
## (Intercept)                                <2e-16 ***
## as.character(sex)2                         <2e-16 ***
## as.character(autism)1                      <2e-16 ***
## as.character(sex)2:as.character(autism)1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.817 on 671599 degrees of freedom
## Multiple R-squared:  0.04766,    Adjusted R-squared:  0.04765 
## F-statistic: 1.12e+04 on 3 and 671599 DF,  p-value: < 2.2e-16
summary(lm(SQ_full ~ as.character(sex) + as.character(autism) + as.character(sex):as.character(autism), data = data2))
## 
## Call:
## lm(formula = SQ_full ~ as.character(sex) + as.character(autism) + 
##     as.character(sex):as.character(autism), data = data2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.0779 -3.0947 -0.4504  2.5496 14.5496 
## 
## Coefficients:
##                                           Estimate Std. Error  t value
## (Intercept)                               6.734478   0.008193  822.012
## as.character(sex)2                       -1.284117   0.010406 -123.406
## as.character(autism)1                     1.343375   0.030948   43.407
## as.character(sex)2:as.character(autism)1  0.301009   0.043319    6.949
##                                          Pr(>|t|)    
## (Intercept)                               < 2e-16 ***
## as.character(sex)2                        < 2e-16 ***
## as.character(autism)1                     < 2e-16 ***
## as.character(sex)2:as.character(autism)1 3.69e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.025 on 671599 degrees of freedom
## Multiple R-squared:  0.0311, Adjusted R-squared:  0.0311 
## F-statistic:  7186 on 3 and 671599 DF,  p-value: < 2.2e-16
summary(lm(SPQ_full ~ as.character(sex) + as.character(autism) + as.character(sex):as.character(autism), data = data2))
## 
## Call:
## lm(formula = SPQ_full ~ as.character(sex) + as.character(autism) + 
##     as.character(sex):as.character(autism), data = data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.1079  -3.9946   0.0054   4.0054  16.0054 
## 
## Coefficients:
##                                          Estimate Std. Error  t value
## (Intercept)                              13.99455    0.01159 1207.809
## as.character(sex)2                        0.82741    0.01472   56.223
## as.character(autism)1                     2.33600    0.04377   53.370
## as.character(sex)2:as.character(autism)1 -0.05005    0.06126   -0.817
##                                          Pr(>|t|)    
## (Intercept)                                <2e-16 ***
## as.character(sex)2                         <2e-16 ***
## as.character(autism)1                      <2e-16 ***
## as.character(sex)2:as.character(autism)1    0.414    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.692 on 671599 degrees of freedom
## Multiple R-squared:  0.01261,    Adjusted R-squared:  0.0126 
## F-statistic:  2859 on 3 and 671599 DF,  p-value: < 2.2e-16

Correlations between AQ, EQ, SQ, and SPQ in the controls data

cor.test(controls$AQ_full, controls$EQ_full)
## 
##  Pearson's product-moment correlation
## 
## data:  controls$AQ_full and controls$EQ_full
## t = -586.6, df = 634950, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5944322 -0.5912418
## sample estimates:
##        cor 
## -0.5928393
cor.test(controls$AQ_full, controls$SQ_full)
## 
##  Pearson's product-moment correlation
## 
## data:  controls$AQ_full and controls$SQ_full
## t = 363.1, df = 634950, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4126139 0.4166874
## sample estimates:
##       cor 
## 0.4146527
cor.test(controls$AQ_full, controls$SPQ_full)
## 
##  Pearson's product-moment correlation
## 
## data:  controls$AQ_full and controls$SPQ_full
## t = 288.44, df = 634950, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3381928 0.3425422
## sample estimates:
##       cor 
## 0.3403694
cor.test(controls$EQ_full, controls$SQ_full)
## 
##  Pearson's product-moment correlation
## 
## data:  controls$EQ_full and controls$SQ_full
## t = -176.6, df = 634950, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2187171 -0.2140280
## sample estimates:
##        cor 
## -0.2163738
cor.test(controls$EQ_full, controls$SPQ_full)
## 
##  Pearson's product-moment correlation
## 
## data:  controls$EQ_full and controls$SPQ_full
## t = -126.12, df = 634950, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1587309 -0.1539317
## sample estimates:
##        cor 
## -0.1563322
cor.test(controls$SQ_full, controls$SPQ_full)
## 
##  Pearson's product-moment correlation
## 
## data:  controls$SQ_full and controls$SPQ_full
## t = 425.13, df = 634950, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4687968 0.4726262
## sample estimates:
##       cor 
## 0.4707137

Age differences in controls

#Age differences
cor.test(controls$AQ_full, controls$age)
## 
##  Pearson's product-moment correlation
## 
## data:  controls$AQ_full and controls$age
## t = -32.633, df = 634950, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.0433738 -0.0384627
## sample estimates:
##        cor 
## -0.0409185
ggplot(controls, aes(AQ_full, age)) + geom_smooth(method = "lm") + theme_minimal()

cor.test(controls$EQ_full, controls$age)
## 
##  Pearson's product-moment correlation
## 
## data:  controls$EQ_full and controls$age
## t = 44.151, df = 634950, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.05287075 0.05777503
## sample estimates:
##        cor 
## 0.05532322
ggplot(controls, aes(EQ_full, age)) + geom_smooth(method = "lm") + theme_minimal()

cor.test(controls$SQ_full, controls$age)
## 
##  Pearson's product-moment correlation
## 
## data:  controls$SQ_full and controls$age
## t = 30.478, df = 634950, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.03576464 0.04067679
## sample estimates:
##        cor 
## 0.03822095
ggplot(controls, aes(SQ_full, age)) + geom_smooth(method = "lm") + theme_minimal()

cor.test(controls$SPQ_full, controls$age)
## 
##  Pearson's product-moment correlation
## 
## data:  controls$SPQ_full and controls$age
## t = 80.472, df = 634950, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.09804299 0.10291266
## sample estimates:
##       cor 
## 0.1004784
ggplot(controls, aes(SPQ_full, age)) + geom_smooth(method = "lm") + theme_minimal()

Age differences in cases

#Age differences
cor.test(cases$AQ_full, cases$age)
## 
##  Pearson's product-moment correlation
## 
## data:  cases$AQ_full and cases$age
## t = 22.515, df = 36646, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1066959 0.1268930
## sample estimates:
##       cor 
## 0.1168065
ggplot(cases, aes(AQ_full, age)) + geom_smooth(method = "lm") + theme_minimal()

cor.test(cases$EQ_full, cases$age)
## 
##  Pearson's product-moment correlation
## 
## data:  cases$EQ_full and cases$age
## t = -9.8091, df = 36646, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06137990 -0.04095703
## sample estimates:
##         cor 
## -0.05117382
ggplot(cases, aes(EQ_full, age)) + geom_smooth(method = "lm") + theme_minimal()

cor.test(cases$SQ_full, cases$age)
## 
##  Pearson's product-moment correlation
## 
## data:  cases$SQ_full and cases$age
## t = 26.126, df = 36646, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1251606 0.1452627
## sample estimates:
##       cor 
## 0.1352256
ggplot(cases, aes(SQ_full, age)) + geom_smooth(method = "lm") + theme_minimal()

cor.test(cases$SPQ_full, cases$age)
## 
##  Pearson's product-moment correlation
## 
## data:  cases$SPQ_full and cases$age
## t = 28.092, df = 36646, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1351556 0.1552004
## sample estimates:
##       cor 
## 0.1451929
ggplot(cases, aes(SPQ_full, age)) + geom_smooth(method = "lm") + theme_minimal()

Correlation with education, controls

#Correlation with education

cor.test(controls$AQ_full, controls$education)
## 
##  Pearson's product-moment correlation
## 
## data:  controls$AQ_full and controls$education
## t = -72.145, df = 634950, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.09260927 -0.08772992
## sample estimates:
##         cor 
## -0.09017014
ggplot(controls, aes(education, AQ_full)) + geom_smooth(method = "lm") + theme_minimal()
## Warning: Removed 3 rows containing non-finite values (stat_smooth).

cor.test(controls$EQ_full, controls$education)
## 
##  Pearson's product-moment correlation
## 
## data:  controls$EQ_full and controls$education
## t = 79.747, df = 634950, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.09714665 0.10201721
## sample estimates:
##        cor 
## 0.09958253
ggplot(controls, aes(education, EQ_full)) + geom_smooth(method = "lm") + theme_minimal()
## Warning: Removed 3 rows containing non-finite values (stat_smooth).

cor.test(controls$SQ_full, controls$education)
## 
##  Pearson's product-moment correlation
## 
## data:  controls$SQ_full and controls$education
## t = 10.739, df = 634950, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.01101702 0.01593548
## sample estimates:
##        cor 
## 0.01347633
ggplot(controls, aes(education, SQ_full)) + geom_smooth(method = "lm") + theme_minimal()
## Warning: Removed 3 rows containing non-finite values (stat_smooth).

cor.test(controls$SPQ_full, controls$education)
## 
##  Pearson's product-moment correlation
## 
## data:  controls$SPQ_full and controls$education
## t = -60.811, df = 634950, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.07853897 -0.07364810
## sample estimates:
##         cor 
## -0.07609399
ggplot(controls, aes(education, SPQ_full)) + geom_smooth(method = "lm") + theme_minimal()
## Warning: Removed 3 rows containing non-finite values (stat_smooth).

Correlation with education, cases

#Correlation with education

cor.test(cases$AQ_full, cases$education)
## 
##  Pearson's product-moment correlation
## 
## data:  cases$AQ_full and cases$education
## t = -9.6043, df = 36646, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06031546 -0.03989038
## sample estimates:
##         cor 
## -0.05010816
ggplot(cases, aes(education, AQ_full)) + geom_smooth(method = "lm") + theme_minimal()

cor.test(cases$EQ_full, cases$education)
## 
##  Pearson's product-moment correlation
## 
## data:  cases$EQ_full and cases$education
## t = 16.144, df = 36646, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.07385792 0.09418983
## sample estimates:
##        cor 
## 0.08403262
ggplot(cases, aes(education, EQ_full)) + geom_smooth(method = "lm") + theme_minimal()

cor.test(cases$SQ_full, cases$education)
## 
##  Pearson's product-moment correlation
## 
## data:  cases$SQ_full and cases$education
## t = 4.2136, df = 36646, p-value = 2.519e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.01177013 0.03223670
## sample estimates:
##        cor 
## 0.02200572
ggplot(cases, aes(education, SQ_full)) + geom_smooth(method = "lm") + theme_minimal()

cor.test(cases$SPQ_full, cases$education)
## 
##  Pearson's product-moment correlation
## 
## data:  cases$SPQ_full and cases$education
## t = -11.977, df = 36646, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.07263359 -0.05223692
## sample estimates:
##         cor 
## -0.06244177
ggplot(cases, aes(education, SPQ_full)) + geom_smooth(method = "lm") + theme_minimal()

Geographical differences in the four measures, controls

m = aov(data=controls, AQ_full~as.character(countryregion))
anova(m)
## Analysis of Variance Table
## 
## Response: AQ_full
##                                 Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.character(countryregion)     13    3485  268.06  52.762 < 2.2e-16 ***
## Residuals                   634938 3225759    5.08                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m = aov(data=controls, EQ_full~as.character(countryregion))
anova(m)
## Analysis of Variance Table
## 
## Response: EQ_full
##                                 Df   Sum Sq Mean Sq F value    Pr(>F)    
## as.character(countryregion)     13    19225 1478.85  61.581 < 2.2e-16 ***
## Residuals                   634938 15247838   24.01                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m = aov(data=controls, SQ_full~as.character(countryregion))
anova(m)
## Analysis of Variance Table
## 
## Response: SQ_full
##                                 Df   Sum Sq Mean Sq F value    Pr(>F)    
## as.character(countryregion)     13    90279  6944.5  428.45 < 2.2e-16 ***
## Residuals                   634938 10291501    16.2                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m = aov(data=controls, SPQ_full~as.character(countryregion))
anova(m)
## Analysis of Variance Table
## 
## Response: SPQ_full
##                                 Df   Sum Sq Mean Sq F value    Pr(>F)    
## as.character(countryregion)     13    28654 2204.12  68.538 < 2.2e-16 ***
## Residuals                   634938 20419033   32.16                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
controls2 = controls[,c ("sex", "countryregion", "AQ_full", "EQ_full", "SQ_full", "SPQ_full" )]

controls2_melt =  melt(controls2, id.vars=c("sex", "countryregion"))

head(controls2_melt)
##   sex countryregion variable value
## 1   1             1  AQ_full     6
## 2   1             2  AQ_full     3
## 3   2             4  AQ_full     2
## 4   1             9  AQ_full     5
## 5   2             4  AQ_full     1
## 6   2             4  AQ_full     0
alpha = ddply(controls2_melt, c("sex", "countryregion", "variable"), summarise,
      mean = mean(value), sd = sd(value),
      sem = sd(value)/sqrt(length(value)))

alpha
##     sex countryregion variable      mean       sd         sem
## 1     1             0  AQ_full  3.537683 2.208153 0.020446734
## 2     1             0  EQ_full  8.940067 4.655561 0.043108896
## 3     1             0  SQ_full  7.012004 4.237871 0.039241232
## 4     1             0 SPQ_full 13.874904 5.699700 0.052777262
## 5     1             1  AQ_full  3.680491 2.305856 0.025541909
## 6     1             1  EQ_full  8.698896 4.654897 0.051562183
## 7     1             1  SQ_full  6.808712 4.195884 0.046477709
## 8     1             1 SPQ_full 14.383926 5.603363 0.062068318
## 9     1             2  AQ_full  3.514864 2.288303 0.017859935
## 10    1             2  EQ_full  8.884077 4.708515 0.036749402
## 11    1             2  SQ_full  6.534539 4.089856 0.031920842
## 12    1             2 SPQ_full 13.979593 5.421161 0.042311518
## 13    1             3  AQ_full  3.423891 2.170407 0.026308443
## 14    1             3  EQ_full  9.204672 4.650771 0.056374014
## 15    1             3  SQ_full  6.345137 4.009179 0.048597001
## 16    1             3 SPQ_full 14.071701 5.553679 0.067318559
## 17    1             4  AQ_full  3.392804 2.210797 0.014564272
## 18    1             4  EQ_full  9.366288 4.791003 0.031562137
## 19    1             4  SQ_full  6.472615 4.069815 0.026811099
## 20    1             4 SPQ_full 13.724546 5.402761 0.035592266
## 21    1             5  AQ_full  3.666031 2.307108 0.023115040
## 22    1             5  EQ_full  8.626079 4.732607 0.047416242
## 23    1             5  SQ_full  6.559526 4.118772 0.041266198
## 24    1             5 SPQ_full 14.408653 5.580591 0.055912244
## 25    1             6  AQ_full  3.655633 2.301357 0.017208363
## 26    1             6  EQ_full  8.706682 4.762788 0.035613671
## 27    1             6  SQ_full  6.633156 4.189445 0.031326511
## 28    1             6 SPQ_full 14.344031 5.543560 0.041451882
## 29    1             7  AQ_full  3.645297 2.298762 0.020276442
## 30    1             7  EQ_full  8.540885 4.734631 0.041762245
## 31    1             7  SQ_full  6.562281 4.126382 0.036397135
## 32    1             7 SPQ_full 14.144402 5.431490 0.047908963
## 33    1             8  AQ_full  3.757121 2.318135 0.022113608
## 34    1             8  EQ_full  8.634635 4.773139 0.045532870
## 35    1             8  SQ_full  6.702339 4.135881 0.039453809
## 36    1             8 SPQ_full 14.356993 5.457821 0.052064325
## 37    1             9  AQ_full  3.781260 2.357438 0.021552723
## 38    1             9  EQ_full  8.419592 4.775541 0.043660062
## 39    1             9  SQ_full  6.681377 4.109097 0.037567146
## 40    1             9 SPQ_full 14.335172 5.453089 0.049854505
## 41    1            10  AQ_full  3.646410 2.326516 0.013529229
## 42    1            10  EQ_full  8.748233 4.811933 0.027982506
## 43    1            10  SQ_full  6.580332 4.158461 0.024182410
## 44    1            10 SPQ_full 14.119847 5.457430 0.031736216
## 45    1            11  AQ_full  3.661345 2.315339 0.017390383
## 46    1            11  EQ_full  8.786698 4.781361 0.035912541
## 47    1            11  SQ_full  6.708338 4.167996 0.031305593
## 48    1            11 SPQ_full 14.214939 5.508773 0.041376094
## 49    1            12  AQ_full  3.463636 2.236946 0.009028093
## 50    1            12  EQ_full  9.045119 4.740131 0.019130702
## 51    1            12  SQ_full  7.057173 4.275507 0.017255526
## 52    1            12 SPQ_full 13.594905 5.543951 0.022374841
## 53    1            13  AQ_full  3.630198 2.330498 0.043024773
## 54    1            13  EQ_full  8.811861 4.757686 0.087834621
## 55    1            13  SQ_full  6.767212 4.264683 0.078732987
## 56    1            13 SPQ_full 14.244717 5.680921 0.104879030
## 57    1            NA  AQ_full  1.000000       NA          NA
## 58    1            NA  EQ_full  6.000000       NA          NA
## 59    1            NA  SQ_full  2.000000       NA          NA
## 60    1            NA SPQ_full  8.000000       NA          NA
## 61    2             0  AQ_full  3.354133 2.194253 0.018280996
## 62    2             0  EQ_full 10.497744 4.670793 0.038913815
## 63    2             0  SQ_full  6.152079 4.007419 0.033387043
## 64    2             0 SPQ_full 15.302700 5.876033 0.048955042
## 65    2             1  AQ_full  3.257416 2.234546 0.017905612
## 66    2             1  EQ_full 10.636317 4.780967 0.038310309
## 67    2             1  SQ_full  5.470720 3.861702 0.030944159
## 68    2             1 SPQ_full 15.017337 5.757003 0.046131375
## 69    2             2  AQ_full  3.099208 2.224139 0.012747717
## 70    2             2  EQ_full 10.895634 4.793870 0.027476208
## 71    2             2  SQ_full  5.238921 3.774540 0.021633888
## 72    2             2 SPQ_full 14.558654 5.720728 0.032788520
## 73    2             3  AQ_full  3.232320 2.171664 0.020651626
## 74    2             3  EQ_full 10.645325 4.723033 0.044914100
## 75    2             3  SQ_full  5.310816 3.790106 0.036042346
## 76    2             3 SPQ_full 14.963013 5.836393 0.055501689
## 77    2             4  AQ_full  2.967371 2.155629 0.012120991
## 78    2             4  EQ_full 11.279784 4.846684 0.027252657
## 79    2             4  SQ_full  5.180505 3.716554 0.020897991
## 80    2             4 SPQ_full 14.335715 5.665844 0.031858753
## 81    2             5  AQ_full  3.229662 2.201800 0.016474985
## 82    2             5  EQ_full 10.626617 4.800155 0.035917189
## 83    2             5  SQ_full  5.234645 3.781857 0.028297769
## 84    2             5 SPQ_full 14.983875 5.707441 0.042705964
## 85    2             6  AQ_full  3.197895 2.220610 0.012427967
## 86    2             6  EQ_full 10.731128 4.852364 0.027156957
## 87    2             6  SQ_full  5.212679 3.782165 0.021167435
## 88    2             6 SPQ_full 14.926831 5.695904 0.031877952
## 89    2             7  AQ_full  3.226686 2.235648 0.014662589
## 90    2             7  EQ_full 10.560392 4.851843 0.031821008
## 91    2             7  SQ_full  5.201222 3.735724 0.024500899
## 92    2             7 SPQ_full 14.767636 5.629790 0.036923203
## 93    2             8  AQ_full  3.185358 2.254743 0.015971816
## 94    2             8  EQ_full 10.712078 4.871450 0.034507660
## 95    2             8  SQ_full  5.255357 3.785719 0.026816718
## 96    2             8 SPQ_full 14.990115 5.721196 0.040526967
## 97    2             9  AQ_full  3.204341 2.265085 0.015075461
## 98    2             9  EQ_full 10.669103 4.933105 0.032832693
## 99    2             9  SQ_full  5.197209 3.765532 0.025061814
## 100   2             9 SPQ_full 14.900775 5.689103 0.037864300
## 101   2            10  AQ_full  3.063272 2.236186 0.009521378
## 102   2            10  EQ_full 11.009065 4.921571 0.020955381
## 103   2            10  SQ_full  5.071557 3.745460 0.015947662
## 104   2            10 SPQ_full 14.766239 5.728368 0.024390615
## 105   2            11  AQ_full  3.137046 2.237841 0.012649832
## 106   2            11  EQ_full 10.919479 4.888902 0.027635473
## 107   2            11  SQ_full  5.221817 3.768285 0.021300968
## 108   2            11 SPQ_full 14.826400 5.714130 0.032300236
## 109   2            12  AQ_full  3.234543 2.234428 0.007760020
## 110   2            12  EQ_full 10.696575 4.825640 0.016759131
## 111   2            12  SQ_full  6.179809 4.126811 0.014332142
## 112   2            12 SPQ_full 14.852828 5.839262 0.020279371
## 113   2            13  AQ_full  3.310419 2.225342 0.029774621
## 114   2            13  EQ_full 10.463480 4.856860 0.064983799
## 115   2            13  SQ_full  5.538489 3.820133 0.051112604
## 116   2            13 SPQ_full 15.204977 5.802020 0.077629854
## 117   2            NA  AQ_full  2.500000 2.121320 1.500000000
## 118   2            NA  EQ_full  8.000000 4.242641 3.000000000
## 119   2            NA  SQ_full  4.500000 2.121320 1.500000000
## 120   2            NA SPQ_full 12.500000 4.949747 3.500000000

Section 3: STEM differences

STEM differences

#STEM differences
t.test(controls$AQ_full ~ controls$STEM)
## 
##  Welch Two Sample t-test
## 
## data:  controls$AQ_full by controls$STEM
## t = -54.648, df = 89225, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.5285422 -0.4919420
## sample estimates:
## mean in group 0 mean in group 1 
##        3.263799        3.774041
ggplot(controls, aes(x=AQ_full, colour=as.character(STEM))) +   geom_density() + theme_minimal()

t.test(controls$EQ_full ~ controls$STEM)
## 
##  Welch Two Sample t-test
## 
## data:  controls$EQ_full by controls$STEM
## t = 83.145, df = 91180, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.578917 1.655154
## sample estimates:
## mean in group 0 mean in group 1 
##       10.250123        8.633087
ggplot(controls, aes(x=EQ_full, colour=as.character(STEM))) +   geom_density() + theme_minimal()

t.test(controls$SQ_full ~ controls$STEM)
## 
##  Welch Two Sample t-test
## 
## data:  controls$SQ_full by controls$STEM
## t = -102.73, df = 88314, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.776259 -1.709749
## sample estimates:
## mean in group 0 mean in group 1 
##        5.740506        7.483511
ggplot(controls, aes(x=SQ_full, colour=as.character(STEM))) +   geom_density() + theme_minimal()

t.test(controls$SPQ_full ~ controls$STEM)
## 
##  Welch Two Sample t-test
## 
## data:  controls$SPQ_full by controls$STEM
## t = 8.2666, df = 92725, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1386827 0.2248830
## sample estimates:
## mean in group 0 mean in group 1 
##        14.52819        14.34641
ggplot(controls, aes(x=SPQ_full, colour=as.character(STEM))) +   geom_density() + theme_minimal()

controls2 = controls[,c ("sex", "STEM", "AQ_full", "EQ_full", "SQ_full", "SPQ_full" )]
controls2 =  controls2[!is.na(controls2$AQ_full),]

controls2_melt =  melt(controls2, id.vars=c("sex", "STEM"))

head(controls2_melt)
##   sex STEM variable value
## 1   1    0  AQ_full     6
## 2   1    0  AQ_full     3
## 3   2    0  AQ_full     2
## 4   1    0  AQ_full     5
## 5   2    0  AQ_full     1
## 6   2    0  AQ_full     0
alpha = ddply(controls2_melt, c("sex", "STEM", "variable"), summarise,
      mean = mean(value), sd = sd(value),
      sem = sd(value)/sqrt(length(value)))

alpha
##    sex STEM variable      mean       sd         sem
## 1    1    0  AQ_full  3.489074 2.247115 0.005162604
## 2    1    0  EQ_full  9.069335 4.729977 0.010866822
## 3    1    0  SQ_full  6.467117 4.096856 0.009412267
## 4    1    0 SPQ_full 13.952211 5.522411 0.012687390
## 5    1    1  AQ_full  3.867619 2.367948 0.010395341
## 6    1    1  EQ_full  8.167322 4.784522 0.021004151
## 7    1    1  SQ_full  7.711205 4.333073 0.019022278
## 8    1    1 SPQ_full 14.149746 5.489321 0.024098230
## 9    1   NA  AQ_full  2.777778 1.986063 0.662020849
## 10   1   NA  EQ_full  9.555556 4.746344 1.582114541
## 11   1   NA  SQ_full  3.777778 2.488864 0.829621362
## 12   1   NA SPQ_full 10.555556 4.639804 1.546601212
## 13   2    0  AQ_full  3.149478 2.216301 0.003627252
## 14   2    0  EQ_full 10.849338 4.831171 0.007906810
## 15   2    0  SQ_full  5.371773 3.839555 0.006283908
## 16   2    0 SPQ_full 14.820487 5.752781 0.009415139
## 17   2    1  AQ_full  3.534249 2.383149 0.016747479
## 18   2    1  EQ_full  9.826609 5.072695 0.035648145
## 19   2    1  SQ_full  6.900044 4.269444 0.030003338
## 20   2    1 SPQ_full 14.850363 5.649177 0.039699353
## 21   2   NA  AQ_full  2.923077 1.497862 0.415432096
## 22   2   NA  EQ_full 10.769231 3.940259 1.092831222
## 23   2   NA  SQ_full  4.307692 2.393903 0.663949096
## 24   2   NA SPQ_full 13.000000 6.218253 1.724632997

STEM regression analyses

AQ analysis

summary(lm(AQ_full ~ STEM + as.character(countryregion) + as.character(handedness) + as.character(sex) + education + age, data = controls))
## 
## Call:
## lm(formula = AQ_full ~ STEM + as.character(countryregion) + as.character(handedness) + 
##     as.character(sex) + education + age, data = controls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0911 -1.7073 -0.3292  1.4041  7.4519 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    5.1579717  0.1045071  49.355  < 2e-16 ***
## STEM                           0.4564204  0.0091719  49.763  < 2e-16 ***
## as.character(countryregion)1   0.0698384  0.0200280   3.487 0.000488 ***
## as.character(countryregion)10 -0.0667375  0.0158391  -4.213 2.52e-05 ***
## as.character(countryregion)11 -0.0265650  0.0171152  -1.552 0.120633    
## as.character(countryregion)12 -0.0267816  0.0150413  -1.781 0.074987 .  
## as.character(countryregion)13  0.0430112  0.0278289   1.546 0.122212    
## as.character(countryregion)2  -0.0929391  0.0172554  -5.386 7.20e-08 ***
## as.character(countryregion)3  -0.0644923  0.0216668  -2.977 0.002915 ** 
## as.character(countryregion)4  -0.1702927  0.0168302 -10.118  < 2e-16 ***
## as.character(countryregion)5   0.0249753  0.0192328   1.299 0.194089    
## as.character(countryregion)6   0.0292916  0.0170738   1.716 0.086238 .  
## as.character(countryregion)7   0.0355810  0.0181426   1.961 0.049858 *  
## as.character(countryregion)8   0.0501540  0.0187756   2.671 0.007557 ** 
## as.character(countryregion)9   0.0550016  0.0183204   3.002 0.002680 ** 
## as.character(handedness)1     -1.0221598  0.1036819  -9.859  < 2e-16 ***
## as.character(handedness)2     -0.9591073  0.1039606  -9.226  < 2e-16 ***
## as.character(handedness)3     -0.3636422  0.1046563  -3.475 0.000512 ***
## as.character(sex)2            -0.2917310  0.0060607 -48.135  < 2e-16 ***
## education                     -0.2218523  0.0031463 -70.512  < 2e-16 ***
## age                           -0.0041805  0.0002339 -17.876  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.229 on 634902 degrees of freedom
##   (32 observations deleted due to missingness)
## Multiple R-squared:  0.02316,    Adjusted R-squared:  0.02313 
## F-statistic: 752.6 on 20 and 634902 DF,  p-value: < 2.2e-16
summary(lm(EQ_full ~ STEM + as.character(countryregion) + as.character(handedness) + as.character(sex) + education + age, data = data2))
## 
## Call:
## lm(formula = EQ_full ~ STEM + as.character(countryregion) + as.character(handedness) + 
##     as.character(sex) + education + age, data = data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.2265  -3.5727   0.1148   3.6218  13.7252 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    5.7854211  0.2103470  27.504  < 2e-16 ***
## STEM                          -1.1061495  0.0192079 -57.588  < 2e-16 ***
## as.character(countryregion)1  -0.1636512  0.0415229  -3.941 8.11e-05 ***
## as.character(countryregion)10  0.1065760  0.0326606   3.263 0.001102 ** 
## as.character(countryregion)11  0.0912821  0.0353718   2.581 0.009862 ** 
## as.character(countryregion)12 -0.0039324  0.0308816  -0.127 0.898672    
## as.character(countryregion)13 -0.1479070  0.0572900  -2.582 0.009831 ** 
## as.character(countryregion)2   0.1027536  0.0356874   2.879 0.003986 ** 
## as.character(countryregion)3   0.1274049  0.0448925   2.838 0.004540 ** 
## as.character(countryregion)4   0.4138088  0.0347594  11.905  < 2e-16 ***
## as.character(countryregion)5  -0.1251053  0.0398486  -3.140 0.001692 ** 
## as.character(countryregion)6  -0.1058326  0.0352688  -3.001 0.002693 ** 
## as.character(countryregion)7  -0.2394007  0.0375717  -6.372 1.87e-10 ***
## as.character(countryregion)8  -0.1368748  0.0388759  -3.521 0.000430 ***
## as.character(countryregion)9  -0.1880976  0.0379629  -4.955 7.24e-07 ***
## as.character(handedness)1      1.6051073  0.2088018   7.687 1.51e-14 ***
## as.character(handedness)2      1.4115858  0.2094017   6.741 1.57e-11 ***
## as.character(handedness)3      0.8017991  0.2108465   3.803 0.000143 ***
## as.character(sex)2             1.6823399  0.0126524 132.966  < 2e-16 ***
## education                      0.5519083  0.0065539  84.210  < 2e-16 ***
## age                            0.0093372  0.0004933  18.930  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.802 on 671548 degrees of freedom
##   (34 observations deleted due to missingness)
## Multiple R-squared:  0.05392,    Adjusted R-squared:  0.0539 
## F-statistic:  1914 on 20 and 671548 DF,  p-value: < 2.2e-16
summary(lm(SQ_full ~ STEM + as.character(countryregion) + as.character(handedness) + as.character(sex) + education + age, data = controls))
## 
## Call:
## lm(formula = SQ_full ~ STEM + as.character(countryregion) + as.character(handedness) + 
##     as.character(sex) + education + age, data = controls)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.0797  -2.9682  -0.7138   2.2849  15.3283 
## 
## Coefficients:
##                                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                    8.2518765  0.1844047   44.749   <2e-16 ***
## STEM                           1.2719874  0.0161840   78.596   <2e-16 ***
## as.character(countryregion)1  -0.5090883  0.0353398  -14.406   <2e-16 ***
## as.character(countryregion)10 -0.8891772  0.0279483  -31.815   <2e-16 ***
## as.character(countryregion)11 -0.7249772  0.0302001  -24.006   <2e-16 ***
## as.character(countryregion)12 -0.0110512  0.0265406   -0.416    0.677    
## as.character(countryregion)13 -0.4893560  0.0491047   -9.966   <2e-16 ***
## as.character(countryregion)2  -0.7509075  0.0304476  -24.662   <2e-16 ***
## as.character(countryregion)3  -0.6757224  0.0382315  -17.675   <2e-16 ***
## as.character(countryregion)4  -0.7651689  0.0296972  -25.766   <2e-16 ***
## as.character(countryregion)5  -0.7406650  0.0339366  -21.825   <2e-16 ***
## as.character(countryregion)6  -0.7354979  0.0301270  -24.413   <2e-16 ***
## as.character(countryregion)7  -0.7471001  0.0320129  -23.337   <2e-16 ***
## as.character(countryregion)8  -0.7088563  0.0331298  -21.396   <2e-16 ***
## as.character(countryregion)9  -0.7683960  0.0323266  -23.770   <2e-16 ***
## as.character(handedness)1     -1.9208224  0.1829486  -10.499   <2e-16 ***
## as.character(handedness)2     -1.8664223  0.1834404  -10.175   <2e-16 ***
## as.character(handedness)3      0.1132400  0.1846680    0.613    0.540    
## as.character(sex)2            -1.1133900  0.0106942 -104.112   <2e-16 ***
## education                      0.0084179  0.0055517    1.516    0.129    
## age                            0.0204007  0.0004127   49.438   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.933 on 634902 degrees of freedom
##   (32 observations deleted due to missingness)
## Multiple R-squared:  0.05396,    Adjusted R-squared:  0.05393 
## F-statistic:  1811 on 20 and 634902 DF,  p-value: < 2.2e-16
summary(lm(SPQ_full ~ STEM + as.character(countryregion) + as.character(handedness) + as.character(sex) + education + age, data = controls))
## 
## Call:
## lm(formula = SPQ_full ~ STEM + as.character(countryregion) + 
##     as.character(handedness) + as.character(sex) + education + 
##     age, data = controls)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.4199  -3.9949  -0.1624   3.8848  17.4793 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   15.5666565  0.2619189  59.433  < 2e-16 ***
## STEM                           0.2481976  0.0229869  10.797  < 2e-16 ***
## as.character(countryregion)1   0.0958396  0.0501949   1.909  0.05622 .  
## as.character(countryregion)10 -0.2595867  0.0396964  -6.539 6.19e-11 ***
## as.character(countryregion)11 -0.1330225  0.0428947  -3.101  0.00193 ** 
## as.character(countryregion)12 -0.2328654  0.0376969  -6.177 6.52e-10 ***
## as.character(countryregion)13  0.0461472  0.0697457   0.662  0.50820    
## as.character(countryregion)2  -0.2797712  0.0432461  -6.469 9.85e-11 ***
## as.character(countryregion)3   0.1082538  0.0543020   1.994  0.04620 *  
## as.character(countryregion)4  -0.3919889  0.0421804  -9.293  < 2e-16 ***
## as.character(countryregion)5   0.1045011  0.0482018   2.168  0.03016 *  
## as.character(countryregion)6   0.0680403  0.0427908   1.590  0.11182    
## as.character(countryregion)7  -0.0960455  0.0454695  -2.112  0.03466 *  
## as.character(countryregion)8   0.0372836  0.0470559   0.792  0.42817    
## as.character(countryregion)9  -0.0720362  0.0459151  -1.569  0.11667    
## as.character(handedness)1     -1.4155590  0.2598507  -5.448 5.11e-08 ***
## as.character(handedness)2     -1.4133260  0.2605493  -5.424 5.82e-08 ***
## as.character(handedness)3      1.4375280  0.2622929   5.481 4.24e-08 ***
## as.character(sex)2             0.7253028  0.0151895  47.750  < 2e-16 ***
## education                     -0.5520497  0.0078853 -70.010  < 2e-16 ***
## age                            0.0450394  0.0005861  76.845  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.586 on 634902 degrees of freedom
##   (32 observations deleted due to missingness)
## Multiple R-squared:  0.03098,    Adjusted R-squared:  0.03094 
## F-statistic:  1015 on 20 and 634902 DF,  p-value: < 2.2e-16

Section 4: Predicting AQ from other variables in controls

summary(lm(AQ_full ~ as.character(handedness) + as.character(sex) + as.character(countryregion) + education + age + as.character(STEM), data = controls ))
## 
## Call:
## lm(formula = AQ_full ~ as.character(handedness) + as.character(sex) + 
##     as.character(countryregion) + education + age + as.character(STEM), 
##     data = controls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0911 -1.7073 -0.3292  1.4041  7.4519 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    5.1579717  0.1045071  49.355  < 2e-16 ***
## as.character(handedness)1     -1.0221598  0.1036819  -9.859  < 2e-16 ***
## as.character(handedness)2     -0.9591073  0.1039606  -9.226  < 2e-16 ***
## as.character(handedness)3     -0.3636422  0.1046563  -3.475 0.000512 ***
## as.character(sex)2            -0.2917310  0.0060607 -48.135  < 2e-16 ***
## as.character(countryregion)1   0.0698384  0.0200280   3.487 0.000488 ***
## as.character(countryregion)10 -0.0667375  0.0158391  -4.213 2.52e-05 ***
## as.character(countryregion)11 -0.0265650  0.0171152  -1.552 0.120633    
## as.character(countryregion)12 -0.0267816  0.0150413  -1.781 0.074987 .  
## as.character(countryregion)13  0.0430112  0.0278289   1.546 0.122212    
## as.character(countryregion)2  -0.0929391  0.0172554  -5.386 7.20e-08 ***
## as.character(countryregion)3  -0.0644923  0.0216668  -2.977 0.002915 ** 
## as.character(countryregion)4  -0.1702927  0.0168302 -10.118  < 2e-16 ***
## as.character(countryregion)5   0.0249753  0.0192328   1.299 0.194089    
## as.character(countryregion)6   0.0292916  0.0170738   1.716 0.086238 .  
## as.character(countryregion)7   0.0355810  0.0181426   1.961 0.049858 *  
## as.character(countryregion)8   0.0501540  0.0187756   2.671 0.007557 ** 
## as.character(countryregion)9   0.0550016  0.0183204   3.002 0.002680 ** 
## education                     -0.2218523  0.0031463 -70.512  < 2e-16 ***
## age                           -0.0041805  0.0002339 -17.876  < 2e-16 ***
## as.character(STEM)1            0.4564204  0.0091719  49.763  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.229 on 634902 degrees of freedom
##   (32 observations deleted due to missingness)
## Multiple R-squared:  0.02316,    Adjusted R-squared:  0.02313 
## F-statistic: 752.6 on 20 and 634902 DF,  p-value: < 2.2e-16
summary(lm(AQ_full ~ as.character(handedness) + as.character(sex) + as.character(countryregion) + education + age + as.character(STEM) + wheelwrightD, data = controls ))
## 
## Call:
## lm(formula = AQ_full ~ as.character(handedness) + as.character(sex) + 
##     as.character(countryregion) + education + age + as.character(STEM) + 
##     wheelwrightD, data = controls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3327 -1.1837 -0.1153  1.0918  8.0603 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    3.8220984  0.0793073  48.194  < 2e-16 ***
## as.character(handedness)1     -0.2916013  0.0786645  -3.707  0.00021 ***
## as.character(handedness)2     -0.2797688  0.0788750  -3.547  0.00039 ***
## as.character(handedness)3     -0.2139561  0.0793968  -2.695  0.00704 ** 
## as.character(sex)2             0.3079321  0.0046807  65.788  < 2e-16 ***
## as.character(countryregion)1   0.1453879  0.0151945   9.568  < 2e-16 ***
## as.character(countryregion)10  0.1473700  0.0120202  12.260  < 2e-16 ***
## as.character(countryregion)11  0.1480570  0.0129868  11.401  < 2e-16 ***
## as.character(countryregion)12 -0.0241577  0.0114109  -2.117  0.03425 *  
## as.character(countryregion)13  0.1186093  0.0211124   5.618 1.93e-08 ***
## as.character(countryregion)2   0.0872924  0.0130933   6.667 2.61e-11 ***
## as.character(countryregion)3   0.1017048  0.0164391   6.187 6.14e-10 ***
## as.character(countryregion)4   0.0783283  0.0127733   6.132 8.67e-10 ***
## as.character(countryregion)5   0.1547891  0.0145920  10.608  < 2e-16 ***
## as.character(countryregion)6   0.1648974  0.0129544  12.729  < 2e-16 ***
## as.character(countryregion)7   0.1413032  0.0137646  10.266  < 2e-16 ***
## as.character(countryregion)8   0.1751611  0.0142451  12.296  < 2e-16 ***
## as.character(countryregion)9   0.1785001  0.0138997  12.842  < 2e-16 ***
## education                     -0.1120334  0.0023923 -46.831  < 2e-16 ***
## age                           -0.0069696  0.0001775 -39.274  < 2e-16 ***
## as.character(STEM)1           -0.0580504  0.0069987  -8.295  < 2e-16 ***
## wheelwrightD                   4.3095952  0.0062979 684.288  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.691 on 634901 degrees of freedom
##   (32 observations deleted due to missingness)
## Multiple R-squared:  0.4378, Adjusted R-squared:  0.4378 
## F-statistic: 2.354e+04 on 21 and 634901 DF,  p-value: < 2.2e-16
summary(lm(AQ_full ~ as.character(handedness) + as.character(sex) + as.character(countryregion) + education + age + as.character(STEM) + wheelwrightD + SPQ_full, data = controls ))
## 
## Call:
## lm(formula = AQ_full ~ as.character(handedness) + as.character(sex) + 
##     as.character(countryregion) + education + age + as.character(STEM) + 
##     wheelwrightD + SPQ_full, data = controls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9192 -1.1747 -0.1214  1.0767  7.6657 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    3.2974485  0.0789539  41.764  < 2e-16 ***
## as.character(handedness)1     -0.2805772  0.0781194  -3.592 0.000329 ***
## as.character(handedness)2     -0.2657465  0.0783284  -3.393 0.000692 ***
## as.character(handedness)3     -0.2788516  0.0788495  -3.537 0.000405 ***
## as.character(sex)2             0.2436201  0.0046980  51.857  < 2e-16 ***
## as.character(countryregion)1   0.1371120  0.0150894   9.087  < 2e-16 ***
## as.character(countryregion)10  0.1445650  0.0119370  12.111  < 2e-16 ***
## as.character(countryregion)11  0.1427103  0.0128969  11.065  < 2e-16 ***
## as.character(countryregion)12 -0.0152637  0.0113322  -1.347 0.178002    
## as.character(countryregion)13  0.1122622  0.0209662   5.354 8.59e-08 ***
## as.character(countryregion)2   0.0873123  0.0130026   6.715 1.88e-11 ***
## as.character(countryregion)3   0.0874866  0.0163259   5.359 8.38e-08 ***
## as.character(countryregion)4   0.0785912  0.0126847   6.196 5.80e-10 ***
## as.character(countryregion)5   0.1429082  0.0144914   9.862  < 2e-16 ***
## as.character(countryregion)6   0.1540849  0.0128651  11.977  < 2e-16 ***
## as.character(countryregion)7   0.1386690  0.0136692  10.145  < 2e-16 ***
## as.character(countryregion)8   0.1661825  0.0141467  11.747  < 2e-16 ***
## as.character(countryregion)9   0.1738619  0.0138035  12.596  < 2e-16 ***
## education                     -0.0971885  0.0023809 -40.820  < 2e-16 ***
## age                           -0.0085524  0.0001770 -48.310  < 2e-16 ***
## as.character(STEM)1           -0.0367117  0.0069538  -5.279 1.30e-07 ***
## wheelwrightD                   4.0500268  0.0068331 592.708  < 2e-16 ***
## SPQ_full                       0.0388722  0.0004122  94.310  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.679 on 634900 degrees of freedom
##   (32 observations deleted due to missingness)
## Multiple R-squared:  0.4456, Adjusted R-squared:  0.4455 
## F-statistic: 2.319e+04 on 22 and 634900 DF,  p-value: < 2.2e-16
summary(lm(AQ_full ~ as.character(handedness) + as.character(sex) + as.character(countryregion) + education + age + as.character(STEM) + SQ_full + EQ_full + SPQ_full, data = controls ))
## 
## Call:
## lm(formula = AQ_full ~ as.character(handedness) + as.character(sex) + 
##     as.character(countryregion) + education + age + as.character(STEM) + 
##     SQ_full + EQ_full + SPQ_full, data = controls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.7792 -1.1605 -0.1011  1.0740  7.4496 
## 
## Coefficients:
##                                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                    4.6370193  0.0780697   59.396  < 2e-16 ***
## as.character(handedness)1     -0.3269993  0.0771272   -4.240 2.24e-05 ***
## as.character(handedness)2     -0.3154953  0.0773337   -4.080 4.51e-05 ***
## as.character(handedness)3     -0.2658667  0.0778472   -3.415 0.000637 ***
## as.character(sex)2             0.2208153  0.0046416   47.573  < 2e-16 ***
## as.character(countryregion)1   0.0963052  0.0149010    6.463 1.03e-10 ***
## as.character(countryregion)10  0.0948195  0.0117916    8.041 8.91e-16 ***
## as.character(countryregion)11  0.1008164  0.0127372    7.915 2.47e-15 ***
## as.character(countryregion)12 -0.0119638  0.0111882   -1.069 0.284925    
## as.character(countryregion)13  0.0743384  0.0207018    3.591 0.000330 ***
## as.character(countryregion)2   0.0462365  0.0128413    3.601 0.000317 ***
## as.character(countryregion)3   0.0450656  0.0161217    2.795 0.005185 ** 
## as.character(countryregion)4   0.0498160  0.0125255    3.977 6.97e-05 ***
## as.character(countryregion)5   0.0875827  0.0143137    6.119 9.43e-10 ***
## as.character(countryregion)6   0.1009163  0.0127083    7.941 2.01e-15 ***
## as.character(countryregion)7   0.0819372  0.0135027    6.068 1.29e-09 ***
## as.character(countryregion)8   0.1144383  0.0139726    8.190 2.61e-16 ***
## as.character(countryregion)9   0.1176131  0.0136351    8.626  < 2e-16 ***
## education                     -0.0678663  0.0023617  -28.736  < 2e-16 ***
## age                           -0.0077112  0.0001749  -44.089  < 2e-16 ***
## as.character(STEM)1            0.0004923  0.0068716    0.072 0.942885    
## SQ_full                        0.1371815  0.0006108  224.598  < 2e-16 ***
## EQ_full                       -0.2398276  0.0004455 -538.367  < 2e-16 ***
## SPQ_full                       0.0559626  0.0004282  130.696  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.658 on 634899 degrees of freedom
##   (32 observations deleted due to missingness)
## Multiple R-squared:  0.4596, Adjusted R-squared:  0.4595 
## F-statistic: 2.347e+04 on 23 and 634899 DF,  p-value: < 2.2e-16

Section 4: Case-control analyses

Case-control STEM

cases_STEM = nrow(subset(cases, STEM == "1"))

cases_nonSTEM = nrow(subset(cases, STEM == "0"))

controls_STEM = nrow(subset(controls, STEM == "1"))

controls_nonSTEM = nrow(subset(controls, STEM == "0"))

STEMdiff <- matrix(c(cases_STEM,controls_STEM,cases_nonSTEM,controls_nonSTEM), ncol=2)
colnames(STEMdiff) <- c('STEM', 'nonSTEM')
row.names(STEMdiff) <- c('Cases', 'Controls')

STEMdiff
##           STEM nonSTEM
## Cases     4130   32516
## Controls 72137  562796
chisq.test(STEMdiff) 
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  STEMdiff
## X-squared = 0.27831, df = 1, p-value = 0.5978
summary(glm(STEM ~ as.character(autism) + as.character(sex) + education + age, data = data2))
## 
## Call:
## glm(formula = STEM ~ as.character(autism) + as.character(sex) + 
##     education + age, data = data2)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.31325  -0.18744  -0.06259  -0.02206   1.05985  
## 
## Coefficients:
##                         Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)            9.989e-02  1.403e-03   71.193   <2e-16 ***
## as.character(autism)1 -3.750e-03  1.654e-03   -2.267   0.0234 *  
## as.character(sex)2    -1.675e-01  7.762e-04 -215.845   <2e-16 ***
## education              3.764e-02  4.127e-04   91.212   <2e-16 ***
## age                    7.219e-04  3.127e-05   23.085   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.09317407)
## 
##     Null deviance: 67606  on 671576  degrees of freedom
## Residual deviance: 62573  on 671572  degrees of freedom
##   (26 observations deleted due to missingness)
## AIC: 312016
## 
## Number of Fisher Scoring iterations: 2

Case-control AQ, EQ, SQ, and SPQ

males = subset(data2, sex == "1")
females = subset(data2, sex == "2")
t.test(males$AQ_full ~ males$autism)
## 
##  Welch Two Sample t-test
## 
## data:  males$AQ_full by males$autism
## t = -64.35, df = 20245, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.344959 -1.265447
## sample estimates:
## mean in group 0 mean in group 1 
##        3.570429        4.875632
ggplot(males, aes(x=AQ_full, colour=as.character(autism))) +   geom_density() + theme_minimal()

t.test(males$EQ_full ~ males$autism)
## 
##  Welch Two Sample t-test
## 
## data:  males$EQ_full by males$autism
## t = 53.934, df = 21079, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.883890 2.025981
## sample estimates:
## mean in group 0 mean in group 1 
##        8.875432        6.920497
ggplot(males, aes(x=EQ_full, colour=as.character(autism))) +   geom_density() + theme_minimal()

t.test(males$SQ_full ~ males$autism)
## 
##  Welch Two Sample t-test
## 
## data:  males$SQ_full by males$autism
## t = -37.886, df = 20472, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.412876 -1.273874
## sample estimates:
## mean in group 0 mean in group 1 
##        6.734478        8.077854
ggplot(males, aes(x=SQ_full, colour=as.character(autism))) +   geom_density() + theme_minimal()

t.test(males$SPQ_full ~ males$autism)
## 
##  Welch Two Sample t-test
## 
## data:  males$SPQ_full by males$autism
## t = -48.823, df = 20363, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.429779 -2.242213
## sample estimates:
## mean in group 0 mean in group 1 
##        13.99455        16.33055
ggplot(males, aes(x=SPQ_full, colour=as.character(autism))) +   geom_density() + theme_minimal()

t.test(females$AQ_full ~ females$autism)
## 
##  Welch Two Sample t-test
## 
## data:  females$AQ_full by females$autism
## t = -72.913, df = 19616, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.535003 -1.454634
## sample estimates:
## mean in group 0 mean in group 1 
##        3.169266        4.664085
ggplot(females, aes(x=AQ_full, colour=as.character(autism))) +   geom_density() + theme_minimal()

t.test(females$EQ_full ~ females$autism)
## 
##  Welch Two Sample t-test
## 
## data:  females$EQ_full by females$autism
## t = 66.896, df = 20100, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.457166 2.605505
## sample estimates:
## mean in group 0 mean in group 1 
##       10.796720        8.265385
ggplot(females, aes(x=EQ_full, colour=as.character(autism))) +   geom_density() + theme_minimal()

t.test(females$SQ_full ~ females$autism)
## 
##  Welch Two Sample t-test
## 
## data:  females$SQ_full by females$autism
## t = -50.195, df = 19845, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.708597 -1.580172
## sample estimates:
## mean in group 0 mean in group 1 
##        5.450361        7.094745
ggplot(females, aes(x=SQ_full, colour=as.character(autism))) +   geom_density() + theme_minimal()

t.test(females$SPQ_full ~ females$autism)
## 
##  Welch Two Sample t-test
## 
## data:  females$SPQ_full by females$autism
## t = -49.417, df = 19995, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.376616 -2.195274
## sample estimates:
## mean in group 0 mean in group 1 
##        14.82196        17.10791
ggplot(females, aes(x=SPQ_full, colour=as.character(autism))) +   geom_density() + theme_minimal()

data2melt = data2[,c ("sex", "autism", "AQ_full", "EQ_full", "SQ_full", "SPQ_full" )]
data2melt =  data2melt[!is.na(data2melt$AQ_full),]
data2melt =  melt(data2melt, id.vars=c("sex", "autism"))


head(data2melt)
##   sex autism variable value
## 1   2      1  AQ_full     6
## 2   1      1  AQ_full     6
## 3   1      0  AQ_full     6
## 4   1      0  AQ_full     3
## 5   2      1  AQ_full     1
## 6   2      0  AQ_full     2
ddply(data2melt, c("sex", "autism", "variable"), summarise,
      mean = mean(value), sd = sd(value),
      sem = sd(value)/sqrt(length(value)))
##    sex autism variable      mean       sd         sem
## 1    1      0  AQ_full  3.570429 2.278934 0.004638778
## 2    1      0  EQ_full  8.875432 4.756196 0.009681254
## 3    1      0  SQ_full  6.734478 4.180116 0.008508640
## 4    1      0 SPQ_full 13.994552 5.515901 0.011227636
## 5    1      1  AQ_full  4.875632 2.662900 0.019745239
## 6    1      1  EQ_full  6.920497 4.710717 0.034929673
## 7    1      1  SQ_full  8.077854 4.642268 0.034422132
## 8    1      1 SPQ_full 16.330548 6.272512 0.046510289
## 9    2      0  AQ_full  3.169266 2.226789 0.003549372
## 10   2      0  EQ_full 10.796720 4.849119 0.007729213
## 11   2      0  SQ_full  5.450361 3.877523 0.006180545
## 12   2      0 SPQ_full 14.821964 5.747510 0.009161196
## 13   2      1  AQ_full  4.664085 2.743430 0.020191941
## 14   2      1  EQ_full  8.265385 5.032808 0.037042007
## 15   2      1  SQ_full  7.094745 4.371088 0.032171680
## 16   2      1 SPQ_full 17.107909 6.160577 0.045342508

Case-control regression analyses**

summary(glm(as.numeric(as.character(autism)) ~ as.character(sex) + education + 
    AQ_full + AQ_full:education + AQ_full:as.character(sex) + as.character(sex):education + 
    as.character(countryregion) + as.character(handedness) + 
    age + age:education + AQ_full:age + age:as.character(sex) + STEM, data = data2))
## 
## Call:
## glm(formula = as.numeric(as.character(autism)) ~ as.character(sex) + 
##     education + AQ_full + AQ_full:education + AQ_full:as.character(sex) + 
##     as.character(sex):education + as.character(countryregion) + 
##     as.character(handedness) + age + age:education + AQ_full:age + 
##     age:as.character(sex) + STEM, data = data2)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.36922  -0.07468  -0.04092  -0.01579   1.06536  
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    1.692e-01  1.010e-02  16.759  < 2e-16 ***
## as.character(sex)2            -1.651e-02  2.259e-03  -7.307 2.74e-13 ***
## education                     -2.421e-02  1.000e-03 -24.201  < 2e-16 ***
## AQ_full                        2.946e-02  4.376e-04  67.313  < 2e-16 ***
## as.character(countryregion)1  -3.285e-02  1.925e-03 -17.061  < 2e-16 ***
## as.character(countryregion)10 -3.844e-02  1.515e-03 -25.371  < 2e-16 ***
## as.character(countryregion)11 -3.745e-02  1.640e-03 -22.830  < 2e-16 ***
## as.character(countryregion)12 -2.134e-02  1.431e-03 -14.908  < 2e-16 ***
## as.character(countryregion)13 -7.821e-03  2.655e-03  -2.946 0.003221 ** 
## as.character(countryregion)2  -3.865e-02  1.655e-03 -23.356  < 2e-16 ***
## as.character(countryregion)3  -3.212e-02  2.081e-03 -15.438  < 2e-16 ***
## as.character(countryregion)4  -3.342e-02  1.613e-03 -20.720  < 2e-16 ***
## as.character(countryregion)5  -3.635e-02  1.848e-03 -19.671  < 2e-16 ***
## as.character(countryregion)6  -3.504e-02  1.636e-03 -21.418  < 2e-16 ***
## as.character(countryregion)7  -3.929e-02  1.742e-03 -22.550  < 2e-16 ***
## as.character(countryregion)8  -3.304e-02  1.803e-03 -18.327  < 2e-16 ***
## as.character(countryregion)9  -3.930e-02  1.761e-03 -22.321  < 2e-16 ***
## as.character(handedness)1     -3.590e-02  9.677e-03  -3.710 0.000207 ***
## as.character(handedness)2     -2.917e-02  9.704e-03  -3.006 0.002651 ** 
## as.character(handedness)3     -1.045e-02  9.771e-03  -1.069 0.285010    
## age                           -2.834e-03  8.073e-05 -35.102  < 2e-16 ***
## STEM                          -5.718e-03  8.948e-04  -6.391 1.65e-10 ***
## education:AQ_full             -2.949e-03  1.301e-04 -22.668  < 2e-16 ***
## as.character(sex)2:AQ_full    -4.855e-03  2.459e-04 -19.745  < 2e-16 ***
## as.character(sex)2:education   6.418e-04  6.429e-04   0.998 0.318145    
## education:age                  6.184e-04  2.301e-05  26.870  < 2e-16 ***
## AQ_full:age                   -2.116e-04  1.002e-05 -21.130  < 2e-16 ***
## as.character(sex)2:age         6.782e-04  4.929e-05  13.759  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.0494977)
## 
##     Null deviance: 34646  on 671568  degrees of freedom
## Residual deviance: 33240  on 671541  degrees of freedom
##   (34 observations deleted due to missingness)
## AIC: -112761
## 
## Number of Fisher Scoring iterations: 2

Brain type analysis

Females control percentage

#Sex distribution

controls = subset(data2, autism == "0")
fem_controls = subset(controls, sex == "2")

cat("total number of females is", nrow(fem_controls))
## total number of females is 393600
extremeE = subset(fem_controls, braintype == "1")
E = subset(fem_controls, braintype == "2")
B = subset(fem_controls, braintype == "3")
S = subset(fem_controls, braintype == "4")
extremeS = subset(fem_controls, braintype == "5")

cat("percentage of females with extreme E is", (nrow(extremeE)/nrow(fem_controls))*100)
## percentage of females with extreme E is 2.893293
cat("percentage of females with E is", (nrow(E)/nrow(fem_controls))*100 )
## percentage of females with E is 40.00864
cat("percentage of females with B is", (nrow(B)/nrow(fem_controls))*100)
## percentage of females with B is 29.81326
cat("percentage of females with S is", (nrow(S)/nrow(fem_controls))*100)
## percentage of females with S is 25.58994
cat("percentage of females with Extreme S is", (nrow(extremeS)/nrow(fem_controls))*100)
## percentage of females with Extreme S is 1.694868

Brain type analysis

Males percentage

#Sex distribution

controls = subset(data2, autism == "0")
male_controls = subset(controls, sex == "1")

cat("total number of males is", nrow(male_controls))
## total number of males is 241355
extremeE = subset(male_controls, braintype == "1")
E = subset(male_controls, braintype == "2")
B = subset(male_controls, braintype == "3")
S = subset(male_controls, braintype == "4")
extremeS = subset(male_controls, braintype == "5")

cat("percentage of males with extreme E is", (nrow(extremeE)/nrow(male_controls))*100)
## percentage of males with extreme E is 0.7532473
cat("percentage of males with E is", (nrow(E)/nrow(male_controls))*100 )
## percentage of males with E is 23.87562
cat("percentage of males with B is", (nrow(B)/nrow(male_controls))*100)
## percentage of males with B is 30.98962
cat("percentage of males with S is", (nrow(S)/nrow(male_controls))*100)
## percentage of males with S is 40.23617
cat("percentage of males with Extreme S is", (nrow(extremeS)/nrow(male_controls))*100)
## percentage of males with Extreme S is 4.145346

Brain type analysis

Females cases percentage

#Sex distribution

cases = subset(data2, autism == "1")
fem_cases = subset(cases, sex == "2")

cat("total number of females is", nrow(fem_cases))
## total number of females is 18460
extremeE = subset(fem_cases, braintype == "1")
E = subset(fem_cases, braintype == "2")
B = subset(fem_cases, braintype == "3")
S = subset(fem_cases, braintype == "4")
extremeS = subset(fem_cases, braintype == "5")

cat("percentage of females with extreme E is", (nrow(extremeE)/nrow(fem_cases))*100)
## percentage of females with extreme E is 0.9317443
cat("percentage of females with E is", (nrow(E)/nrow(fem_cases))*100 )
## percentage of females with E is 22.19935
cat("percentage of females with B is", (nrow(B)/nrow(fem_cases))*100)
## percentage of females with B is 27.026
cat("percentage of females with S is", (nrow(S)/nrow(fem_cases))*100)
## percentage of females with S is 42.29144
cat("percentage of females with Extreme S is", (nrow(extremeS)/nrow(fem_cases))*100)
## percentage of females with Extreme S is 7.551463

Brain type analysis

Males cases percentage

#Sex distribution

cases = subset(data2, autism == "1")
male_cases = subset(cases, sex == "1")

cat("total number of males is", nrow(male_cases))
## total number of males is 18188
extremeE = subset(male_cases, braintype == "1")
E = subset(male_cases, braintype == "2")
B = subset(male_cases, braintype == "3")
S = subset(male_cases, braintype == "4")
extremeS = subset(male_cases, braintype == "5")

cat("percentage of females with extreme E is", (nrow(extremeE)/nrow(male_cases))*100)
## percentage of females with extreme E is 0.3023972
cat("percentage of females with E is", (nrow(E)/nrow(male_cases))*100 )
## percentage of females with E is 13.37145
cat("percentage of females with B is", (nrow(B)/nrow(male_cases))*100)
## percentage of females with B is 23.92237
cat("percentage of females with S is", (nrow(S)/nrow(male_cases))*100)
## percentage of females with S is 50.97867
cat("percentage of females with Extreme S is", (nrow(extremeS)/nrow(male_cases))*100)
## percentage of females with Extreme S is 11.42512

Anova with brain types

m = aov(data=controls, AQ_full~as.character(braintype))
anova(m)
## Analysis of Variance Table
## 
## Response: AQ_full
##                             Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.character(braintype)      4 1223971  305993   96889 < 2.2e-16 ***
## Residuals               634950 2005284       3                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD((m), "as.character(braintype)")
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = AQ_full ~ as.character(braintype), data = controls)
## 
## $`as.character(braintype)`
##          diff       lwr       upr p adj
## 2-1 0.9440355 0.9005766 0.9874945     0
## 3-1 2.1152719 2.0716630 2.1588808     0
## 4-1 3.8655434 3.8219748 3.9091119     0
## 5-1 6.3070507 6.2505831 6.3635184     0
## 3-2 1.1712363 1.1560195 1.1864531     0
## 4-2 2.9215078 2.9064071 2.9366085     0
## 5-2 5.3630152 5.3240484 5.4019820     0
## 4-3 1.7502715 1.7347446 1.7657984     0
## 5-3 4.1917788 4.1526449 4.2309128     0
## 5-4 2.4415074 2.4024184 2.4805963     0
m = aov(data=controls, SPQ_full~as.character(braintype))
anova(m)
## Analysis of Variance Table
## 
## Response: SPQ_full
##                             Df   Sum Sq Mean Sq F value    Pr(>F)    
## as.character(braintype)      4  2584807  646202   22970 < 2.2e-16 ***
## Residuals               634950 17862955      28                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD((m), "as.character(braintype)")
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = SPQ_full ~ as.character(braintype), data = controls)
## 
## $`as.character(braintype)`
##          diff       lwr       upr p adj
## 2-1  2.169472  2.039764  2.299181     0
## 3-1  3.923305  3.793149  4.053461     0
## 4-1  6.269979  6.139944  6.400014     0
## 5-1 10.221173 10.052639 10.389707     0
## 3-2  1.753832  1.708416  1.799249     0
## 4-2  4.100507  4.055437  4.145577     0
## 5-2  8.051701  7.935400  8.168002     0
## 4-3  2.346674  2.300332  2.393016     0
## 5-3  6.297868  6.181068  6.414668     0
## 5-4  3.951194  3.834528  4.067860     0
controls2 = controls[,c ("braintype", "AQ_full", "SPQ_full" )]
controls2 =  controls2[!is.na(controls2$AQ_full),]
controls2_melt =  melt(controls2, id.vars=c("braintype"))


ddply(controls2_melt, c("braintype", "variable"), summarise,
      mean = mean(value), sd = sd(value),
      sem = sd(value)/sqrt(length(value)))
##    braintype variable       mean       sd         sem
## 1          1  AQ_full  0.9918219 1.002425 0.008723006
## 2          1 SPQ_full 10.3633197 5.345941 0.046519871
## 3          2  AQ_full  1.9358574 1.432427 0.003088540
## 4          2 SPQ_full 12.5327919 5.241795 0.011302144
## 5          3  AQ_full  3.1070938 1.744665 0.003980184
## 6          3 SPQ_full 14.2866243 5.244543 0.011964615
## 7          4  AQ_full  4.8573653 2.135523 0.004801244
## 8          4 SPQ_full 16.6332986 5.410782 0.012164928
## 9          5  AQ_full  7.2988726 1.953192 0.015125123
## 10         5 SPQ_full 20.5844927 5.469007 0.042350893

Sex differences

sexdiff = matrix(c(18230, 241604, 18515, 393930), ncol = 2, byrow = TRUE)

colnames(sexdiff) = c("Cases", "Controls")

rownames(sexdiff) = c("Males", "Females")

sexdiff
##         Cases Controls
## Males   18230   241604
## Females 18515   393930
chisq.test(sexdiff)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  sexdiff
## X-squared = 1969.5, df = 1, p-value < 2.2e-16

Additional analyses

Other sex

data3 = subset(data, repeat. == 0) #695166

data3$autism = ifelse(data3$diagnosis_0 == "2" | data3$diagnosis_1 == "2"| data3$diagnosis_3 == "2" | data3$diagnosis_4 == "2" | data3$diagnosis_5 == "2" | data3$diagnosis_6 == "2" | data3$diagnosis_7 == "2" | data3$diagnosis_8 == "2" | data3$autism_diagnosis_0 > 0 | data3$autism_diagnosis_1 > 0 | data3$autism_diagnosis_2 > 0, 1, 0)

data3$autism[is.na(data3$autism)] <- 0

cases_2 = subset(data3, autism == "1")
controls_2 = subset(data3, autism == "0")

controls1 = subset(controls_2, sex == "1")
controls2 = subset(controls_2, sex == "2")
controls3 = subset(controls_2, sex == "3")

a = nrow(controls1)

b = nrow(controls2)

c = nrow(controls3)

cases1 = subset(cases_2, sex == "1")
cases2 = subset(cases_2, sex == "2")
cases3 = subset(cases_2, sex == "3")

d = nrow(cases1)

e = nrow(cases2)

f = nrow(cases3)

sex_3_way = matrix(c(a, d, b, e, c, f ), ncol = 2, byrow = TRUE)

colnames(sex_3_way) = c("Controls", "Cases")

rownames(sex_3_way) = c("Males", "Females", "Other")

sex_3_way
##         Controls Cases
## Males     241604 18230
## Females   393930 18515
## Other       2875   978
chisq.test(sex_3_way)
## 
##  Pearson's Chi-squared test
## 
## data:  sex_3_way
## X-squared = 4817.1, df = 2, p-value < 2.2e-16
sex_2_way = matrix(c(d+e, a+b, f, c ), ncol = 2, byrow = TRUE)

colnames(sex_2_way) = c("Cases", "Controls")

rownames(sex_2_way) = c("Binary", "Non-binary")

sex_2_way
##            Cases Controls
## Binary     36745   635534
## Non-binary   978     2875
chisq.test(sex_2_way)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  sex_2_way
## X-squared = 2881.1, df = 1, p-value < 2.2e-16

ANOVA with handedness in controls

#ANOVA with handedness
###AQ###

cat("handedness AQ results")
## handedness AQ results
m = aov(data=controls, AQ_full~as.character(handedness))
anova(m)
## Analysis of Variance Table
## 
## Response: AQ_full
##                              Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.character(handedness)      3   11032  3677.3  725.52 < 2.2e-16 ***
## Residuals                634942 3218199     5.1                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD((m), "as.character(handedness)")
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = AQ_full ~ as.character(handedness), data = controls)
## 
## $`as.character(handedness)`
##            diff         lwr        upr    p adj
## 1-0 -1.15234475 -1.42125375 -0.8834358 0.00e+00
## 2-0 -1.06600812 -1.33564396 -0.7963723 0.00e+00
## 3-0 -0.47424075 -0.74570118 -0.2027803 4.25e-05
## 2-1  0.08633663  0.06362319  0.1090501 0.00e+00
## 3-1  0.67810400  0.63933311  0.7168749 0.00e+00
## 3-2  0.59176738  0.54824016  0.6352946 0.00e+00
tky = as.data.frame(TukeyHSD(m)$"as.character(handedness)")
tky$pair = rownames(tky)
tky$`p adj` = ifelse(tky$`p adj` == 0, 2.2e-16, tky$`p adj`)


# Plot pairwise TukeyHSD comparisons and color by significance level
ggplot(tky, aes(colour=cut(`p adj`, c(0, 0.01, 0.05, 1), 
                           label=c("p<0.01","p<0.05","Non-Sig")))) +
  geom_hline(yintercept=0, lty="11", colour="grey30") +
  geom_errorbar(aes(pair, ymin=lwr, ymax=upr), width=0.2) +
  geom_point(aes(pair, diff)) +
  labs(colour="") +labs(x = "Difference", y = "Comparisons") + theme(legend.position="none") + theme_minimal()

###EQ###

cat("handedness EQ results")
## handedness EQ results
m = aov(data=controls, EQ_full~as.character(handedness))
anova(m)
## Analysis of Variance Table
## 
## Response: EQ_full
##                              Df   Sum Sq Mean Sq F value    Pr(>F)    
## as.character(handedness)      3    17894  5964.6  248.36 < 2.2e-16 ***
## Residuals                634942 15249059    24.0                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD((m), "as.character(handedness)")
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = EQ_full ~ as.character(handedness), data = controls)
## 
## $`as.character(handedness)`
##           diff        lwr        upr p adj
## 1-0  1.9037164  1.3183599  2.4890729 0e+00
## 2-0  1.6122419  1.0253033  2.1991806 0e+00
## 3-0  1.1732242  0.5823138  1.7641346 2e-06
## 2-1 -0.2914745 -0.3409167 -0.2420323 0e+00
## 3-1 -0.7304922 -0.8148880 -0.6460964 0e+00
## 3-2 -0.4390177 -0.5337670 -0.3442684 0e+00
tky = as.data.frame(TukeyHSD(m)$"as.character(handedness)")
tky$pair = rownames(tky)
tky$`p adj` = ifelse(tky$`p adj` == 0, 2.2e-16, tky$`p adj`)


# Plot pairwise TukeyHSD comparisons and color by significance level
ggplot(tky, aes(colour=cut(`p adj`, c(0, 0.01, 0.05, 1), 
                           label=c("p<0.01","p<0.05","Non-Sig")))) +
  geom_hline(yintercept=0, lty="11", colour="grey30") +
  geom_errorbar(aes(pair, ymin=lwr, ymax=upr), width=0.2) +
  geom_point(aes(pair, diff)) +
  labs(colour="") +labs(x = "Difference", y = "Comparisons") + theme(legend.position="none") + theme_minimal()

###SQ###
cat("handedness SQ results")
## handedness SQ results
m = aov(data=controls, SQ_full~as.character(handedness))
anova(m)
## Analysis of Variance Table
## 
## Response: SQ_full
##                              Df   Sum Sq Mean Sq F value    Pr(>F)    
## as.character(handedness)      3   105051   35017  2163.5 < 2.2e-16 ***
## Residuals                634942 10276637      16                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD((m), "as.character(handedness)")
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = SQ_full ~ as.character(handedness), data = controls)
## 
## $`as.character(handedness)`
##           diff         lwr        upr     p adj
## 1-0 -2.2470011 -2.72753541 -1.7664667 0.0000000
## 2-0 -2.1402850 -2.62211815 -1.6584518 0.0000000
## 3-0 -0.0968786 -0.58197230  0.3882151 0.9560049
## 2-1  0.1067161  0.06612772  0.1473045 0.0000000
## 3-1  2.1501225  2.08083975  2.2194052 0.0000000
## 3-2  2.0434064  1.96562420  2.1211885 0.0000000
tky = as.data.frame(TukeyHSD(m)$"as.character(handedness)")
tky$pair = rownames(tky)
tky$`p adj` = ifelse(tky$`p adj` == 0, 2.2e-16, tky$`p adj`)


# Plot pairwise TukeyHSD comparisons and color by significance level
ggplot(tky, aes(colour=cut(`p adj`, c(0, 0.01, 0.05, 1), 
                           label=c("p<0.01","p<0.05","Non-Sig")))) +
  geom_hline(yintercept=0, lty="11", colour="grey30") +
  geom_errorbar(aes(pair, ymin=lwr, ymax=upr), width=0.2) +
  geom_point(aes(pair, diff)) +
  labs(colour="") +labs(x = "Difference", y = "Comparisons") + theme(legend.position="none") + theme_minimal()

###SPQ##
cat("handedness SPQ results")
## handedness SPQ results
m = aov(data=controls, SPQ_full~as.character(handedness))
anova(m)
## Analysis of Variance Table
## 
## Response: SPQ_full
##                              Df   Sum Sq Mean Sq F value    Pr(>F)    
## as.character(handedness)      3   199691   66564  2087.3 < 2.2e-16 ***
## Residuals                634942 20247766      32                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD((m), "as.character(handedness)")
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = SPQ_full ~ as.character(handedness), data = controls)
## 
## $`as.character(handedness)`
##           diff        lwr          upr     p adj
## 1-0 -1.6009632 -2.2754718 -0.926454634 0.0000000
## 2-0 -1.6504084 -2.3267401 -0.974076690 0.0000000
## 3-0  1.3721611  0.6912527  2.053069447 0.0000013
## 2-1 -0.0494452 -0.1064176  0.007527249 0.1152571
## 3-1  2.9731243  2.8758746  3.070373908 0.0000000
## 3-2  3.0225695  2.9133895  3.131749456 0.0000000
tky = as.data.frame(TukeyHSD(m)$"as.character(handedness)")
tky$pair = rownames(tky)
tky$`p adj` = ifelse(tky$`p adj` == 0, 2.2e-16, tky$`p adj`)


# Plot pairwise TukeyHSD comparisons and color by significance level
ggplot(tky, aes(colour=cut(`p adj`, c(0, 0.01, 0.05, 1), 
                           label=c("p<0.01","p<0.05","Non-Sig")))) +
  geom_hline(yintercept=0, lty="11", colour="grey30") +
  geom_errorbar(aes(pair, ymin=lwr, ymax=upr), width=0.2) +
  geom_point(aes(pair, diff)) +
  labs(colour="") +labs(x = "Difference", y = "Comparisons") + theme(legend.position="none") + theme_minimal()

Generating plots

multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }

 if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

data2$category = ifelse(data2$sex == 1 & data2$autism == 0, "control_males", "NA")
data2$category = ifelse(data2$sex == 2 & data2$autism == 0, "control_females", data2$category)
data2$category = ifelse(data2$sex == 2 & data2$autism == 1, "autism_females", data2$category)
data2$category = ifelse(data2$sex == 1 & data2$autism == 1, "autism_males", data2$category)

#CDFfigure
cdf_plot = ggplot(data2, aes(wheelwrightD, colour = category)) + stat_ecdf() +xlab("Dscore") + ylab("Cumulative frequency") + theme_classic()

#Case-control by sex Figure

a = ggplot(data2, aes(x=AQ_full, colour=as.character(category))) +   geom_density(adjust = 4) + theme_classic() + xlab("AQ-10 scores") + scale_x_continuous(breaks=seq(0,11,2))
b = ggplot(data2, aes(x=EQ_full, colour=as.character(category))) +   geom_density(adjust = 2) + theme_classic() + xlab("EQ-10 scores") + scale_x_continuous(breaks=seq(0,41,2))
c = ggplot(data2, aes(x=SQ_full, colour=as.character(category))) +   geom_density(adjust = 2) + theme_classic() + xlab("SQ-10 scores") + scale_x_continuous(breaks=seq(0,41,2))
d = ggplot(data2, aes(x=SPQ_full, colour=as.character(category))) +   geom_density(adjust = 2) + theme_classic() + xlab("SPQ-10 scores") + scale_x_continuous(breaks=seq(0,31,5))
case_control_plot = multiplot(a, b, c, d, cols=2)

#STEM figure
controls2 = controls[!is.na(controls$STEM),]
controls2$category = ifelse(controls2$sex == 1 & controls2$STEM == 0, "nonSTEM_males", "NA")
controls2$category = ifelse(controls2$sex == 2 & controls2$STEM == 0, "nonSTEM_females", controls2$category)
controls2$category = ifelse(controls2$sex == 1 & controls2$STEM == 1, "STEM_males", controls2$category)
controls2$category = ifelse(controls2$sex == 2 & controls2$STEM == 1, "STEM_females", controls2$category)

a = ggplot(controls2, aes(x=AQ_full, colour=as.character(category))) +   geom_density(adjust = 4) + theme_classic() + xlab("AQ-10 scores") + scale_x_continuous(breaks=seq(0,11,2))
b = ggplot(controls2, aes(x=EQ_full, colour=as.character(category))) +   geom_density(adjust = 2) + theme_classic() + xlab("EQ-10 scores") + scale_x_continuous(breaks=seq(0,41,2))
c = ggplot(controls2, aes(x=SQ_full, colour=as.character(category))) +   geom_density(adjust = 2) + theme_classic() + xlab("SQ-10 scores") + scale_x_continuous(breaks=seq(0,41,2))
d = ggplot(controls2, aes(x=SPQ_full, colour=as.character(category))) +   geom_density(adjust = 2) + theme_classic() + xlab("SPQ-10 scores") + scale_x_continuous(breaks=seq(0,31,5))
stem_diff_plot = multiplot(a, b, c, d, cols=2)